Simulations with a single SNP & different sample sizes

1 Setup

First, simulate genotype data for 101,000 subjects. Then, for each subject’s genotype, simulate replicate traits.

Code
library(magrittr)
Code
# Simulation setup: genotypes for n subjects at a single SNP, plus replicate traits.
# NOTE: set.seed(2023-04-15) evaluates the arithmetic 2023 - 4 - 15 = 2004;
# kept as-is so the random number stream (and all results below) is unchanged.
set.seed(2023-04-15)
n <- 101000     # total number of subjects (training + testing)
n_traits <- 10  # replicate traits per subject
n_snp <- 1      # single SNP
# simulate genotypes matrix (additive 0/1/2 coding; HWE with allele freq 0.5)
geno <- matrix(sample(c(0,1,2), n*n_snp, replace=TRUE, prob=c(0.25,0.5,0.25)), nrow=n, ncol=n_snp)
# simulate phenotype: same genetic effect for every replicate trait,
# independent N(0, 1) noise for each subject x trait cell
beta <- 10
y <- as.numeric(geno %*% beta) %*% t(rep(1, n_traits)) + matrix(data = rnorm(n * n_traits), nrow = n, ncol = n_traits)
# Name the matrix columns up front so tibble::as_tibble() needs no name
# repair. This removes the tibble 2.0.0 compatibility warning and makes the
# downstream rename_with()/rename() calls unnecessary.
colnames(y) <- paste0("pheno", seq_len(n_traits))
colnames(geno) <- "geno" # fix this when using more than one SNP
# prepare for splitting into training and testing
n_test <- 1000
# get test subject ids
test_ids <- sample(1:n, n_test, replace = FALSE)
# organize data: one row per subject with pheno1..pheno10, geno, id, test flag
dat <- tibble::as_tibble(y) %>%
    dplyr::bind_cols(tibble::as_tibble(geno)) %>%
    dplyr::mutate(id = seq_len(n)) %>%
    dplyr::mutate(in_test_set = id %in% test_ids)
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
Code
# split into training and testing
training <- dat %>% dplyr::filter(!in_test_set)
testing <- dat %>% dplyr::filter(in_test_set)
# test rows get fold = NA so they can be separated again after bind_rows()
# in the main loop below (NA_integer_ is the typed idiom for as.integer(NA))
testing2 <- testing %>% dplyr::mutate(fold = NA_integer_)
# use all training with leave K out
alpha <- 0.1   # miscoverage level: target (1 - alpha) coverage for the PIs
n_folds <- 5
n_train <- nrow(training)
# partition training data into n_folds folds of (nearly) equal size
folds <- split(training$id, sample(rep(1:n_folds, length.out = n_train)))
# Attach each training id's fold label via a lookup table + join.
# The original per-id map_int(~which(sapply(folds, ...))) scanned every fold
# for every id — O(n_train * n_folds) vector membership tests; this is a
# single O(n_train) join with identical fold assignments.
fold_lookup <- tibble::tibble(
    id = unlist(folds, use.names = FALSE),
    fold = rep(seq_along(folds), lengths(folds))
)
training2_pre <- training %>% dplyr::left_join(fold_lookup, by = "id")

Above, we specified the number of replicates for the simulations. We created 10 replicate traits for the same 101,000 (i.e., \(1.01 \times 10^5\)) subjects. Note that each subject has only 1 SNP.

2 python code, adapted from Barber’s code for JK+ paper

Code
def leastsq_minL2(X,Y,X1,tol=1e-8):
    """Minimum-L2-norm least squares via an SVD pseudo-inverse.

    Fits betahat on (X, Y) and returns predictions X1 @ betahat.

    X   : (n, d) training design matrix
    Y   : (n,) training responses
    X1  : (m, d) points to predict at
    tol : singular values below dX[0] * tol are treated as zero (rank cutoff)
    """
    # full_matrices=False keeps U at (n, min(n, d)) instead of (n, n).
    # Only the first rX <= min(n, d) components are used below, so the
    # result is identical while avoiding an O(n^2) memory blow-up when
    # n is large (e.g. n = 10000 would otherwise allocate a 10000x10000 U).
    uX,dX,vX = np.linalg.svd(X, full_matrices=False)
    rX = (dX>=dX[0]*tol).sum()
    betahat = (vX[:rX].T/dX[:rX]).dot(uX[:,:rX].T.dot(Y))
    return X1.dot(betahat)

def compute_PIs(X,Y,X1,alpha,fit_muh_fun):
    """K-fold CV+ prediction intervals (Barber et al., jackknife+/CV+).

    X           : (n, d) training design matrix
    Y           : (n,) training responses
    X1          : (n1, d) test design matrix
    alpha       : target miscoverage level (e.g. 0.1 for 90% intervals)
    fit_muh_fun : callable(Xtrain, Ytrain, Xtest) -> predictions at Xtest
    Returns a DataFrame with MultiIndex columns {'CV+'} x {'lower', 'upper'},
    one row per test point.

    Fix: the original signature (X, Y, X1, alpha=0.1, fit_muh_fun) was a
    SyntaxError (non-default argument after a default); the default was
    dropped — the call site passes every argument positionally.
    """
    n = len(Y)
    #n1 = X1.shape[0]
    n1 = len(X1)
    ###############################
    # CV+
    ###############################

    K = 5
    n_K = np.floor(n/K).astype(int)
    base_inds_to_delete = np.arange(n_K).astype(int)
    resids_LKO = np.zeros(n)
    muh_LKO_vals_testpoint = np.zeros((n,n1))
    for i in range(K):
        inds_to_delete = (base_inds_to_delete + n_K*i).astype(int)
        # axis=0 so whole rows are dropped; without it np.delete flattens a
        # 2-D X, which breaks the downstream fit for any (n, d) matrix input
        muh_vals_LKO = fit_muh_fun(np.delete(X,inds_to_delete,0),
                                    np.delete(Y,inds_to_delete),\
                                   np.r_[X[inds_to_delete],X1])
        # held-out absolute residuals for this fold
        resids_LKO[inds_to_delete] = np.abs(Y[inds_to_delete] - muh_vals_LKO[:n_K])
        # predictions at the test points from the model that left this fold out
        for inner_K in range(n_K):
            muh_LKO_vals_testpoint[inds_to_delete[inner_K]] = muh_vals_LKO[n_K:]
    # CV+ quantile index: ceil((1 - alpha) * (n + 1))-th order statistic
    ind_Kq = (np.ceil((1-alpha)*(n+1))).astype(int)
    PIs_dict = {'CV+' : pd.DataFrame(\
                    np.c_[np.sort(muh_LKO_vals_testpoint.T - resids_LKO,axis=1).T[-ind_Kq], \
                        np.sort(muh_LKO_vals_testpoint.T + resids_LKO,axis=1).T[ind_Kq-1]],\
                           columns = ['lower','upper'])}

    return pd.concat(PIs_dict.values(), axis=1, keys=PIs_dict.keys())
Code
# simulation
# NOTE(review): this cell does not run as written — `ntrial`, `X`, `Y`,
# `X1`, and `Y1` are never defined here (the X/Y generation lines below are
# commented out), and `np`/`pd`/`alpha` must already be in scope.
# Presumably the data are generated elsewhere (e.g. in the R chunks) and
# injected; confirm before relying on this cell.
d = 1

method_names = ['CV+']
n_vals = [10000]

# per-sample-size stores for the simulated X and Y arrays (placeholder 0
# until overwritten inside the loop)
Y_dic = dict.fromkeys(n_vals, 0)
X_dic = dict.fromkeys(n_vals, 0)


for n in n_vals:
    # one row per (trial, method) with its empirical coverage and width
    results = pd.DataFrame(columns = ['itrial','d','method','coverage','width'])
    Xa = np.zeros((n, ntrial))
    Ya = np.zeros((n, ntrial))
    for itrial in range(ntrial):
        #X = np.random.normal(size=(n,d))
        #Xa[:, itrial] = X
        
        #Y = X.dot(beta) + np.random.normal(size=n)
        #Ya[:, itrial] = Y
        
        # store X, Y for later use with R code
        # store as numpy arrays
        
        PIs = compute_PIs(X,Y,X1,alpha,leastsq_minL2)
        for method in method_names:
            # empirical coverage: fraction of test responses inside the PI
            coverage = ((PIs[method]['lower'] <= Y1)&(PIs[method]['upper'] >= Y1)).mean()
            # mean interval width across test points
            width = (PIs[method]['upper'] - PIs[method]['lower']).mean()
            results.loc[len(results)]=[itrial,d,method,coverage,width]
    X_dic[n] = Xa
    Y_dic[n] = Ya
Code
# Main experiment: for each training-set size, run 5-fold CV+ for every
# replicate trait and collect per-test-subject prediction intervals.
tictoc::tic() # timing
tl <- list() # one entry per training set size, each a list of per-trait tibbles
n_per_fold_values <- c(20000, 2000, 200, 20) # per-fold sizes => n_train of 1e5..1e2
for (n_per_fold in n_per_fold_values){
    # subsample the full training data down to n_per_fold rows per fold
    training2 <- training2_pre %>%
        dplyr::group_by(fold) %>%
        dplyr::slice_sample(n = n_per_fold) %>%
        dplyr::ungroup()
    # store each trait's outputs
    out <- list()
    # loop over traits
    for (trait_num in 1:n_traits){
        # keep only this trait's phenotype column, renamed to "pheno"
        tr2_one_trait <- training2 %>%
            dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
            dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
        te2_one_trait <- testing2 %>%
            dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
            dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
        
        # regress leaving one fold out
        preds <- list()
        for (fold_num in 1:n_folds) {
            # get training data
            train <- tr2_one_trait %>% dplyr::filter(fold != fold_num)
            # get testing data
            test <- tr2_one_trait %>% dplyr::filter(fold == fold_num)
            # fit model
            fit <- lm(pheno ~ geno, data = train)
            # predict for both the held-out fold and all test subjects;
            # test rows are distinguishable later by fold == NA
            foo <- test %>% dplyr::bind_rows(te2_one_trait)
            foo$pred <- predict(fit, newdata = foo)
            foo$fold_left_out <- fold_num
            # record coefficient estimates and standard errors for this fit
            result <- foo %>%
                dplyr::mutate(beta1_hat = coef(fit)[2],
                              beta0_hat = coef(fit)[1],
                              se_beta1_hat = summary(fit)$coefficients[2, 2],
                              se_beta0_hat = summary(fit)$coefficients[1, 2]
                )
            # save predictions
            preds[[fold_num]] <- result
        }
        # assemble predicted values
        # get absolute residuals (out-of-fold, as CV+ requires)
        preds_training <- preds %>%
            dplyr::bind_rows() %>%
            dplyr::filter(!is.na(fold)) %>% # keep only training data
            dplyr::mutate(absolute_residual = abs(pheno - pred)) %>%
            dplyr::select( - fold_left_out)
        preds_test <- preds %>%
            dplyr::bind_rows() %>%
            dplyr::filter(is.na(fold)) # test rows carry fold == NA
        # get CV+ order-statistic indexes for the upper and lower bounds
        plus_index <- ceiling((1 - alpha) * (nrow(preds_training) + 1))
        minus_index <- floor(alpha * (nrow(preds_training) + 1))
    
        # go one by one through test set (testing2)
        test_list <- list()
        for (i in 1:nrow(testing2)){
            tt <- testing2[i, ]
            pt2 <- preds_test %>% 
                dplyr::filter(id == tt$id) %>% # our only use of tt
                dplyr::rename_with(function(x)paste0("test_", x)) 
                # pt2 contains the five predicted values for a single test subject
            nrow(pt2) # 5 (one prediction per leave-fold-out fit; value unused)
            # pair each training residual with the test prediction from the
            # fit that left that training row's fold out
            preds_all <- preds_training %>%
                dplyr::left_join(pt2, by = c("fold" = "test_fold_left_out")) %>%
                dplyr::mutate(test_fitted_plus_absolute_residual = test_pred + absolute_residual, 
                              test_fitted_minus_absolute_residual = test_pred - absolute_residual) 
            # CV+ interval endpoints: order statistics of fitted +/- residual
            uu <- sort(preds_all$test_fitted_plus_absolute_residual)[plus_index]
            ll <- sort(preds_all$test_fitted_minus_absolute_residual)[minus_index]
            # make a tibble with exactly one row
            test_list[[i]] <- preds_all %>%
                dplyr::select(test_id, test_geno, test_pheno, test_beta1_hat, fold) %>%
                dplyr::mutate(lower = ll, upper = uu) %>%
                dplyr::distinct() %>%
                tidyr::pivot_wider(names_from = fold, 
                                    values_from = test_beta1_hat,
                                    names_prefix = "beta1_hat_fold_"
                                    )
        }
        # per-test-subject coverage indicator and interval width for this trait
        test_tib <- test_list %>%
            dplyr::bind_rows() %>%
            dplyr::mutate(in_interval = test_pheno >= lower & test_pheno <= upper) %>%
            dplyr::mutate(interval_width = upper - lower) %>%
            dplyr::mutate(training_set_size = n_per_fold * n_folds,
                            trait_num = trait_num)
        out[[trait_num]] <- test_tib
    }
    tl[[as.character(n_per_fold * n_folds)]]  <- out
}
tictoc::toc() # timing
1170.879 sec elapsed

3 Organize results

Code
#test_tib_thin <- test_tib %>%
#    dplyr::select(test_id, test_geno)
# Flatten the nested list (training set size -> trait -> tibble) into one
# long tibble of per-test-subject results.
tt_intermediate <- tl %>%
    dplyr::bind_rows(.id = "id") 
# Summarise coverage, interval width, and the fold-specific slope
# estimates for every (training set size, trait) combination.
results <- tt_intermediate %>%
    dplyr::group_by(training_set_size, trait_num) %>%
    dplyr::summarise(
        mean_interval_width = mean(interval_width),
        sd_interval_width = sd(interval_width),
        mean_in_interval = mean(in_interval),
        sd_in_interval = sd(in_interval),
        beta1_hat_fold_1 = mean(beta1_hat_fold_1),
        beta1_hat_fold_2 = mean(beta1_hat_fold_2),
        beta1_hat_fold_3 = mean(beta1_hat_fold_3),
        beta1_hat_fold_4 = mean(beta1_hat_fold_4),
        beta1_hat_fold_5 = mean(beta1_hat_fold_5),
        median_interval_width = median(interval_width)
    ) %>%
    dplyr::ungroup() %>%
    # mean and sd over the five fold-specific slope estimates, computed
    # row by row (equivalent to the pmap_dbl() version, but simpler)
    dplyr::rowwise() %>%
    dplyr::mutate(
        mean_b1 = mean(c(beta1_hat_fold_1, beta1_hat_fold_2, beta1_hat_fold_3,
                         beta1_hat_fold_4, beta1_hat_fold_5)),
        sd_b1 = sd(c(beta1_hat_fold_1, beta1_hat_fold_2, beta1_hat_fold_3,
                     beta1_hat_fold_4, beta1_hat_fold_5))
    ) %>%
    dplyr::ungroup()
`summarise()` has grouped output by 'training_set_size'. You can override using
the `.groups` argument.
Code
# Render the summary table as plain-text markdown.
print(knitr::kable(results))
training_set_size trait_num mean_interval_width sd_interval_width mean_in_interval sd_in_interval beta1_hat_fold_1 beta1_hat_fold_2 beta1_hat_fold_3 beta1_hat_fold_4 beta1_hat_fold_5 median_interval_width mean_b1 sd_b1
1e+02 1 2.921635 0.0338406 0.861 0.3461196 10.144775 10.084259 10.156009 10.207762 10.229971 2.903374 10.164555 0.0571352
1e+02 2 3.400168 0.0479503 0.904 0.2947386 10.284413 10.433048 10.195788 10.309489 10.182274 3.417457 10.281002 0.1011801
1e+02 3 2.959576 0.0274067 0.838 0.3686352 10.258043 10.237098 10.357109 10.186041 10.323785 2.953241 10.272415 0.0684218
1e+02 4 3.465805 0.0161127 0.897 0.3041110 10.050247 9.973961 9.985624 10.006781 10.012632 3.465242 10.005849 0.0293424
1e+02 5 3.411780 0.0196258 0.903 0.2961059 9.929178 9.889154 9.955201 9.882220 9.811763 3.404434 9.893503 0.0545596
1e+02 6 3.318161 0.0126017 0.890 0.3130463 9.891345 9.965037 10.029101 10.111511 10.033860 3.310578 10.006171 0.0825624
1e+02 7 2.963591 0.0264555 0.859 0.3481957 9.934817 9.862027 10.012954 9.980182 10.072676 2.988351 9.972531 0.0796164
1e+02 8 3.537046 0.0350224 0.921 0.2698737 9.890595 10.055453 9.894569 9.909012 10.065810 3.557909 9.963088 0.0893833
1e+02 9 3.742438 0.0273368 0.930 0.2552747 10.061785 9.961189 10.062375 9.894782 9.905695 3.753650 9.977165 0.0815038
1e+02 10 3.585745 0.0167196 0.929 0.2569534 9.862102 10.040087 9.959794 9.940470 9.931375 3.577481 9.946766 0.0638810
1e+03 1 3.275217 0.0032393 0.896 0.3054133 9.957759 9.882612 9.911832 9.931689 9.958303 3.278435 9.928439 0.0321636
1e+03 2 3.257939 0.0036221 0.906 0.2919747 10.085727 10.049794 10.079084 10.028032 10.063635 3.257813 10.061254 0.0232245
1e+03 3 3.170237 0.0059241 0.875 0.3308844 10.060263 9.993658 9.984433 9.972919 10.002791 3.171126 10.002813 0.0339697
1e+03 4 3.111504 0.0093935 0.875 0.3308844 9.994867 9.924238 9.949674 9.949382 9.915245 3.118248 9.946681 0.0309438
1e+03 5 3.378520 0.0065462 0.908 0.2891706 10.040390 10.017083 10.010253 10.047849 10.075971 3.377119 10.038309 0.0262275
1e+03 6 3.286605 0.0060274 0.891 0.3117952 9.973149 10.027497 9.994283 9.918000 9.999980 3.289288 9.982582 0.0409780
1e+03 7 3.323791 0.0062989 0.896 0.3054133 9.998527 9.998433 9.920840 9.944078 10.007080 3.322136 9.973792 0.0387748
1e+03 8 3.292830 0.0075072 0.903 0.2961059 9.947854 9.981830 9.993932 9.926350 9.969340 3.295648 9.963861 0.0270208
1e+03 9 3.354618 0.0041783 0.895 0.3067068 10.019911 10.006189 10.023264 9.994405 10.041268 3.350776 10.017008 0.0177766
1e+03 10 3.198862 0.0075158 0.904 0.2947386 10.044854 10.013072 10.047176 10.038894 10.047386 3.200544 10.038276 0.0145011
1e+04 1 3.310541 0.0009650 0.902 0.2974634 10.021660 10.010814 10.014112 10.008762 10.016811 3.309692 10.014432 0.0050805
1e+04 2 3.286012 0.0002208 0.904 0.2947386 10.018995 10.023293 10.018539 10.024231 10.024176 3.285831 10.021847 0.0028405
1e+04 3 3.280503 0.0008335 0.888 0.3155243 10.015706 10.009537 9.989779 10.012863 10.016124 3.280568 10.008802 0.0109565
1e+04 4 3.278412 0.0014521 0.893 0.3092679 9.992840 10.004126 10.007997 10.019211 9.998729 3.279599 10.004580 0.0099749
1e+04 5 3.302901 0.0020362 0.904 0.2947386 10.001119 10.010678 10.016645 10.000905 10.017682 3.303808 10.009406 0.0081157
1e+04 6 3.279139 0.0009032 0.891 0.3117952 9.998677 10.007868 9.994740 9.992067 10.004279 3.279032 9.999526 0.0065467
1e+04 7 3.301598 0.0006366 0.891 0.3117952 9.996191 10.002808 9.990721 9.994745 10.005151 3.301535 9.997923 0.0059389
1e+04 8 3.298867 0.0003537 0.906 0.2919747 10.041291 10.011785 10.012316 10.031420 10.013665 3.298984 10.022096 0.0134950
1e+04 9 3.296364 0.0010892 0.886 0.3179703 9.983654 9.985410 9.996185 9.984194 9.982405 3.296888 9.986370 0.0055920
1e+04 10 3.235491 0.0009209 0.902 0.2974634 9.996320 10.006593 9.994737 10.005933 10.017067 3.235306 10.004130 0.0090265
1e+05 1 3.296623 0.0001264 0.902 0.2974634 10.002655 10.001333 10.002348 9.999124 9.999804 3.296726 10.001053 0.0015495
1e+05 2 3.279585 0.0003147 0.901 0.2988115 9.998164 9.991361 9.996980 9.997468 9.998482 3.279756 9.996491 0.0029270
1e+05 3 3.285824 0.0001588 0.888 0.3155243 10.000471 10.001378 10.000519 10.004000 10.002327 3.285673 10.001739 0.0014735
1e+05 4 3.290886 0.0003835 0.895 0.3067068 10.006763 10.002382 10.007280 10.004623 10.000640 3.290882 10.004338 0.0028336
1e+05 5 3.287503 0.0001293 0.902 0.2974634 10.008277 10.005476 10.006756 10.004284 10.010222 3.287517 10.007003 0.0023329
1e+05 6 3.292680 0.0002049 0.891 0.3117952 9.999753 10.003428 9.997483 10.002022 10.003029 3.292584 10.001143 0.0024942
1e+05 7 3.283640 0.0001915 0.891 0.3117952 10.008251 10.004705 10.002706 10.003063 10.004170 3.283757 10.004579 0.0022063
1e+05 8 3.280418 0.0004028 0.902 0.2974634 9.997292 9.998804 9.994012 9.991641 9.987772 3.280107 9.993904 0.0044219
1e+05 9 3.303658 0.0002511 0.890 0.3130463 10.006192 10.002125 10.003135 9.999696 10.007449 3.303787 10.003719 0.0031265
1e+05 10 3.287962 0.0001693 0.911 0.2848862 9.996853 9.992655 9.995233 9.993382 9.996725 3.287939 9.994970 0.0019088

4 Figures

4.1 Boxplots for interval width

Code
library(ggplot2)
# Boxplots of per-test-subject interval widths: one group per trait,
# coloured by training set size.
width_boxplot <- ggplot(
    tt_intermediate,
    aes(x = as.factor(trait_num),
        y = interval_width,
        colour = as.factor(training_set_size))
) +
    geom_boxplot()
print(width_boxplot)

Code
#ggsave(here::here("figures", "interval_width_boxplot.png"), width = 10, height = 10)

4.2 Relationship between \(\hat\beta\) and median interval width

Code
# Scatterplot of mean slope estimate vs. median interval width.
# `replicate_num` is not a real ggplot aesthetic; it is put in aes() only so
# plotly can surface the trait number in the hover tooltip.
p1 <- ggplot(
    results,
    aes(x = mean_b1,
        y = median_interval_width,
        colour = as.factor(training_set_size),
        replicate_num = trait_num)
) +
    geom_point()
plotly::ggplotly(p1, tooltip = c("x", "y", "colour", "replicate_num"))

5 Session Info

Code
# Record R version, platform, and package versions for reproducibility.
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.0 (2023-04-21)
 os       Ubuntu 18.04.6 LTS
 system   x86_64, linux-gnu
 ui       X11
 language en_US:
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Detroit
 date     2023-04-25
 pandoc   1.19.2.4 @ /usr/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version date (UTC) lib source
 cli           3.6.1   2023-03-23 [1] CRAN (R 4.2.3)
 colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.2.2)
 crosstalk     1.2.0   2021-11-04 [1] CRAN (R 4.1.2)
 data.table    1.14.8  2023-02-17 [1] CRAN (R 4.2.2)
 digest        0.6.31  2022-12-11 [1] CRAN (R 4.2.2)
 dplyr         1.1.1   2023-03-22 [1] CRAN (R 4.2.3)
 ellipsis      0.3.2   2021-04-29 [2] CRAN (R 4.2.1)
 evaluate      0.20    2023-01-17 [1] CRAN (R 4.2.2)
 fansi         1.0.4   2023-01-22 [1] CRAN (R 4.2.2)
 farver        2.1.1   2022-07-06 [1] CRAN (R 4.2.3)
 fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.2.3)
 generics      0.1.3   2022-07-05 [1] CRAN (R 4.2.3)
 ggplot2     * 3.4.2   2023-04-03 [1] CRAN (R 4.2.3)
 glue          1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
 gtable        0.3.3   2023-03-21 [1] CRAN (R 4.2.3)
 htmltools     0.5.5   2023-03-23 [1] CRAN (R 4.2.3)
 htmlwidgets   1.6.2   2023-03-17 [1] CRAN (R 4.2.2)
 httr          1.4.5   2023-02-24 [1] CRAN (R 4.2.3)
 jsonlite      1.8.4   2022-12-06 [1] CRAN (R 4.2.3)
 knitr         1.42    2023-01-25 [1] CRAN (R 4.2.3)
 labeling      0.4.2   2020-10-20 [2] CRAN (R 4.0.3)
 lattice       0.21-8  2023-04-05 [1] CRAN (R 4.2.3)
 lazyeval      0.2.2   2019-03-15 [2] CRAN (R 4.0.3)
 lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.2.2)
 magrittr    * 2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
 Matrix        1.5-4   2023-04-04 [1] CRAN (R 4.2.3)
 munsell       0.5.0   2018-06-12 [2] CRAN (R 4.0.3)
 pillar        1.9.0   2023-03-22 [1] CRAN (R 4.2.3)
 pkgconfig     2.0.3   2019-09-22 [2] CRAN (R 4.0.3)
 plotly        4.10.1  2022-11-07 [1] CRAN (R 4.2.2)
 png           0.1-8   2022-11-29 [1] CRAN (R 4.2.3)
 purrr         1.0.1   2023-01-10 [1] CRAN (R 4.2.2)
 R6            2.5.1   2021-08-19 [2] CRAN (R 4.1.1)
 Rcpp          1.0.10  2023-01-22 [1] CRAN (R 4.2.2)
 reticulate    1.28    2023-01-27 [1] CRAN (R 4.2.3)
 rlang         1.1.0   2023-03-14 [1] CRAN (R 4.2.2)
 rmarkdown     2.21    2023-03-26 [1] CRAN (R 4.2.3)
 scales        1.2.1   2022-08-20 [1] CRAN (R 4.2.3)
 sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.1.2)
 stringi       1.7.12  2023-01-11 [1] CRAN (R 4.2.2)
 stringr       1.5.0   2022-12-02 [1] CRAN (R 4.2.3)
 tibble        3.2.1   2023-03-20 [1] CRAN (R 4.2.3)
 tictoc        1.1     2022-09-03 [1] CRAN (R 4.2.1)
 tidyr         1.3.0   2023-01-24 [1] CRAN (R 4.2.3)
 tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.2.2)
 utf8          1.2.3   2023-01-31 [1] CRAN (R 4.2.3)
 vctrs         0.6.1   2023-03-22 [1] CRAN (R 4.2.3)
 viridisLite   0.4.1   2022-08-22 [1] CRAN (R 4.2.3)
 withr         2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
 xfun          0.38    2023-03-24 [1] CRAN (R 4.2.3)
 yaml          2.3.7   2023-01-23 [1] CRAN (R 4.2.3)

 [1] /net/mulan/home/fredboe/R/x86_64-pc-linux-gnu-library/4.0
 [2] /net/mario/cluster/lib/R/site-library-bionic-40
 [3] /usr/local/lib/R/site-library
 [4] /usr/lib/R/site-library
 [5] /usr/lib/R/library

──────────────────────────────────────────────────────────────────────────────
Code
# git commit info: show the most recent commit of this document's repository
repo <- git2r::repository(here::here())
gr <- git2r::commits(repo)
gr[[1]]
[0944658] 2023-04-25: fix: deleted ggsave call for use on csg cluster